File set-up

Set working directory to current directory

if (rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}

Load standard libraries and resolves conflicts

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(conflicted)
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package
conflict_prefer("select", "dplyr")
## [conflicted] Will prefer dplyr::select over any other package
conflict_prefer("slice", "dplyr")
## [conflicted] Will prefer dplyr::slice over any other package
conflict_prefer("rename", "dplyr")
## [conflicted] Will prefer dplyr::rename over any other package
conflict_prefer('intersect', 'dplyr')
## [conflicted] Will prefer dplyr::intersect over any other package

Read data

table with all measured Cq values

cq = read_tsv("../data/details/selected_circ_cq.txt")
## Rows: 1560 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (10): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (15): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cq

Label each circ for three methods

qPCR validation

cq = cq %>%
  # qPCR_val is 'pass' for all circ that have at least one replicate > 10
  mutate(qPCR_val = ifelse((Cq_min_untreated < 10) & (Cq_max_untreated < 10), 'fail', 'pass'),
         qPCR_val = ifelse((is.na(Cq_min_untreated)) & (is.na(Cq_max_untreated)), 'fail', qPCR_val),
         # qPCR_val_detail describes the reason why a circ fails this validation step if it does
         qPCR_val_detail = ifelse((Cq_min_untreated < 10) & (Cq_max_untreated < 10), 'all_Cq_under_10', 'pass'),
         qPCR_val_detail = ifelse(is.na(Cq_min_untreated) & is.na(Cq_max_untreated), 'all_NA', qPCR_val_detail))

cq

statistics

cq %>% count(qPCR_val)
cq %>% count(qPCR_val_detail)

RR val

cq = cq %>%
  # calculate delta cq
  # first make temp 47 for NAs
  mutate(Cq_min_treated_tmp = Cq_min_treated,
         Cq_min_treated = ifelse(is.na(Cq_min_treated), 47, Cq_min_treated),
         Cq_diff = Cq_min_treated - Cq_max_untreated) %>% 
  mutate(# fail if diff > 3
         RR_val = ifelse(Cq_diff > 3, "fail", 'pass'),
         RR_val_detail = ifelse(RR_val == 'fail', "FP (Cq_diff > 3)", 'pass'),
         # when Cq untreated is high => put NA
         RR_val = ifelse(Cq_min_untreated > 32, NA, RR_val),
         RR_val_detail = ifelse(Cq_min_untreated > 32, 'out_of_range', RR_val_detail),
         # if qPCR already failed => RR also becomes 'fail'
         RR_val = ifelse(qPCR_val == "fail", 'fail', RR_val),
         RR_val_detail = ifelse(qPCR_val == "fail", "fail_qPCR", RR_val_detail)) %>%
  # clean up NAs for 47
  mutate(Cq_diff = ifelse(Cq_min_treated == 47, NA, Cq_diff),
         Cq_min_treated = Cq_min_treated_tmp) %>%
  select(-Cq_min_treated_tmp) %>%
  #' to clean up dataframe: also remove Cq_diff for 'all_Cq_under_10' group and out_of_range group
  mutate(Cq_diff = ifelse(qPCR_val_detail == 'all_Cq_under_10', NA, Cq_diff),
         Cq_diff = ifelse(RR_val_detail == 'out_of_range', NA, Cq_diff))

cq

statistics

cq %>% count(RR_val)
cq %>% count(RR_val_detail)

Amplicon sequencing

cq_amp = read_tsv('../data/details/amp_seq.txt')
## Rows: 1216 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): circ_id, cell_line, amp_seq_val, amp_seq_val_detail
## dbl (1): perc_on_target
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

statistics

cq_amp %>% count(amp_seq_val)
cq_amp %>% count(amp_seq_val_detail)

add to cq dataframe

cq = cq %>% 
  left_join(cq_amp %>% unique()) %>%
  mutate(amp_seq_val_detail = ifelse(is.na(amp_seq_val_detail), "not_included", amp_seq_val_detail))
## Joining, by = c("circ_id", "cell_line")

statistics

cq %>% count(amp_seq_val)
cq %>% count(amp_seq_val_detail)

Add a compound validation ‘decision’ for each circ

cq = cq %>%
  mutate(RR_val_tmp = ifelse(RR_val_detail == 'out_of_range', 'fail', RR_val)) %>% # when circ cannot be measured for RR => count as fail
  mutate(compound_val = ifelse(qPCR_val == "pass" & RR_val_tmp == "pass" & amp_seq_val == "pass", "pass", "NA"),
         compound_val = ifelse(qPCR_val == 'fail' | RR_val_tmp == "fail" | amp_seq_val == 'fail', 'fail', compound_val),
         compound_val = ifelse(is.na(amp_seq_val), NA, compound_val)) %>% 
  select(-RR_val_tmp)

statistics

cq %>% count(compound_val)

Add circRNA annotation and save as Supplementary Table 3

all_circ = read_tsv("../data/Supplementary_Table_2_all_circRNAs.txt")
## Rows: 1137099 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (16): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group...
## dbl  (7): start, end, BSJ_count, n_detected, n_db, estim_len_in, BSJ_count_m...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all_circ
cq = cq %>%
  left_join(all_circ) 
## Joining, by = c("chr", "start", "end", "strand", "circ_id", "circ_id_strand",
## "tool", "cell_line", "BSJ_count", "count_group")
cq %>%
  write_tsv('../data/Supplementary_Table_3_selected_circRNAs.txt')

Calculate val rates (precision) per method per tool

perc_val = cq %>% 
  select(tool, circ_id, cell_line, count_group, qPCR_val, RR_val, RR_val_detail, amp_seq_val, amp_seq_val_detail, compound_val) %>%
  group_by(tool, count_group) %>%
  summarise(nr_qPCR_total = n(),
            nr_qPCR_fail = sum(qPCR_val == 'fail'),
            nr_qPCR_val = sum(qPCR_val == 'pass'),
            nr_RR_total = n() - sum(is.na(RR_val)),  # here NA are the ones that have are 'out_of_range'
            nr_RR_fail = sum(RR_val == "fail", na.rm = T),
            nr_RR_val = sum(RR_val == "pass", na.rm = T),
            nr_amp_total = n() - sum(is.na(amp_seq_val)),  # here NA are the ones 'not_included'
            nr_amp_fail = sum(amp_seq_val == "fail", na.rm = T),
            nr_amp_val = sum(amp_seq_val == "pass", na.rm = T),
            nr_compound_total = n() - sum(is.na(amp_seq_val)), # here NA are the ones 'not_included'
            nr_compound_fail = sum(compound_val == "fail", na.rm = T),
            nr_compound_val = sum(compound_val == "pass", na.rm = T)) %>%
  mutate(perc_qPCR_val = nr_qPCR_val/nr_qPCR_total,
         perc_RR_val = nr_RR_val/nr_RR_total,
         perc_amp_val = nr_amp_val/nr_amp_total,
         perc_compound_val = nr_compound_val/nr_compound_total) %>%
  ungroup()
## `summarise()` has grouped output by 'tool'. You can override using the
## `.groups` argument.
perc_val

Save dataframe (Sup Table 6A)

perc_val %>% write_tsv('../data/Supplementary_Table_6A_precision_values.txt')

Calculate sensitivity

Based on validated set (n = 657)

first calculate the total nr of validated circ per count group

nr_val = cq %>% 
  # get set of uniquely validated circRNAs
  filter(compound_val == 'pass') %>%
  select(circ_id, cell_line, count_group_median) %>% unique() %>%
  count(count_group_median) %>%
  rename(nr_expected = n)

nr_val

then calculate sensitivity by dividing nr of circ found by total

sens = cq %>% 
  # get set of uniquely validated circRNAs
  filter(compound_val == 'pass') %>%
  select(circ_id, cell_line, count_group_median) %>% unique() %>%
  # check witch tools have detected these
  left_join(all_circ %>%
              select(tool, circ_id, cell_line) %>% unique()) %>%
  group_by(tool, count_group_median) %>% 
  summarise(nr_detected = n()) %>%
  left_join(nr_val) %>%
  mutate(sensitivity = nr_detected/nr_expected) %>%
  ungroup()
## Joining, by = c("circ_id", "cell_line")
## `summarise()` has grouped output by 'tool'. You can override using the
## `.groups` argument.
## Joining, by = "count_group_median"
sens

Based on theoretical nr of TPs => extrapolated sensitivity

sens_2 = perc_val %>% 
  # use perc_compound_val
  select(tool, count_group, perc_compound_val) %>%
  # get the number of detected circRNAs for each cell line and tool, per count group
  left_join(all_circ %>%
              group_by(tool, count_group) %>%
              summarise(total_n = n())) %>%
  # calculate the theoretical nr of validated circ
  mutate(extrapolated_sensitivity = perc_compound_val * total_n) 
## `summarise()` has grouped output by 'tool'. You can override using the
## `.groups` argument.
## Joining, by = c("tool", "count_group")
sens_2

take both types of sensitivity together and fix Sailfish-circ count group thing

sens = sens %>% 
  filter(!tool == 'Sailfish-cir') %>% 
  full_join(sens_2 %>% filter(!tool == 'Sailfish-cir'),
            by = c('count_group_median' = 'count_group', 'tool')) %>%
  bind_rows(sens %>% 
              filter(tool == 'Sailfish-cir') %>%
              full_join(sens_2 %>% filter(tool == 'Sailfish-cir') %>%
                        select(-count_group),
                        by = c('tool')))

sens

Save dataframe (Sup Table 6B)

sens %>% write_tsv('../data/Supplementary_Table_6B_sensitivity_values.txt')

Make dataframe with all info per tool and ranking

val_sens_df = perc_val %>%
  select(tool, count_group, perc_qPCR_val, perc_RR_val, perc_amp_val, perc_compound_val) %>%
  full_join(sens %>% rename(count_group = count_group_median) %>%
              select(-nr_detected, -nr_expected, -total_n) %>%
              mutate(count_group = ifelse(tool == "Sailfish-cir", 'no_counts', count_group))) %>%
  full_join(all_circ %>%
              group_by(tool, count_group) %>% count()) %>%
  group_by(count_group) %>%
  mutate(qPCR_precision_rank = dense_rank(desc(perc_qPCR_val)),
         RR_precision_rank = dense_rank(desc(perc_RR_val)),
         amp_precision_rank = dense_rank(desc(perc_amp_val)),
         compound_precision_rank = dense_rank(desc(perc_compound_val)),
         sensitivity_rank = dense_rank(desc(sensitivity)),
         nr_circ_rank = dense_rank(desc(n)),
         extrapolated_sensitivity_rank = dense_rank(desc(extrapolated_sensitivity))) %>%
  ungroup() %>%
  filter(!is.na(n)) %>% 
  select(tool, count_group, n,  nr_circ_rank, perc_qPCR_val, qPCR_precision_rank,
         perc_RR_val, RR_precision_rank, perc_amp_val, amp_precision_rank,
         perc_compound_val, compound_precision_rank, sensitivity, sensitivity_rank,
         extrapolated_sensitivity, extrapolated_sensitivity_rank) %>%
  arrange(desc(count_group), tool) %>%
  rename(nr_circ_detected = n, qPCR_precision = perc_qPCR_val,
         RR_precision = perc_RR_val, amp_precision = perc_amp_val,
         compound_precision = perc_compound_val) %>% 
  mutate(across(c(qPCR_precision, RR_precision, amp_precision,
                  compound_precision, sensitivity), round, 4),
         extrapolated_sensitivity = round(extrapolated_sensitivity, 0))
## Joining, by = c("tool", "count_group", "perc_compound_val")
## Joining, by = c("tool", "count_group")
val_sens_df
val_sens_df %>%
  write_tsv('../data/Supplementary_Table_6_tool_ranking.txt')

RNase R enrichment calculations

calculate enrichment

all_circ_treated = read_tsv('../data/Supplementary_Table_4_all_circRNAs_treated.txt')
## Rows: 1694840 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (14): chr, strand, cell_line, RNaseR, tool, circ_id, circ_id_strand, cou...
## dbl  (4): start, end, BSJ_count, estim_len_in
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
treatment = all_circ %>%
  rename(count_UT = BSJ_count) %>% 
  select(chr, start, end, strand, cell_line, tool, circ_id, circ_id_strand, count_UT, count_group) %>%
  full_join(all_circ_treated %>% 
              rename(count_T = BSJ_count) %>%
              select(chr, start, end, strand, cell_line, tool, circ_id, circ_id_strand, count_T))
## Joining, by = c("chr", "start", "end", "strand", "cell_line", "tool",
## "circ_id", "circ_id_strand")
treatment
treatment = treatment %>%
  left_join(read_tsv('../data/details/nr_reads.txt') %>% 
              select(-RNA_id) %>%
              pivot_wider(names_from = RNaseR, values_from = nr_reads) %>%
              rename(nr_reads_T = treated, nr_reads_UT = untreated)) %>%
  # calculate CPMs and enrichment factor (Sailfish-cir is already TPM)
  mutate(cpm_T = ifelse(tool == 'Sailfish-cir', count_T, 1000000 * count_T/nr_reads_T), 
         cpm_UT = ifelse(tool == 'Sailfish-cir', count_UT, 1000000 * count_UT/nr_reads_UT),
         enrichment = cpm_T/cpm_UT,
         enrichment_bin = ifelse(enrichment > 1, 'enriched', 'not enriched'),
         enrichment_bin = ifelse(is.na(count_T), 'count treated NA', enrichment_bin),
         enrichment_bin = ifelse(is.na(count_UT), 'count untreated NA', enrichment_bin))
## Rows: 6 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): cell_line, RNaseR, RNA_id
## dbl (1): nr_reads
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining, by = "cell_line"
treatment %>% write_tsv('../data/Supplementary_Table_5_RNase_R_enrichment_seq.txt')

Get all stats

Per tool val rates => stats

RT-qPCR precision

perc_val %>%
  filter(count_group == 'count ≥ 5') %>%
  pull(perc_qPCR_val) %>%
  quantile()
##     0%    25%    50%    75%   100% 
## 0.9000 0.9875 0.9875 1.0000 1.0000
perc_val %>%
  filter(count_group == 'count < 5') %>%
  pull(perc_qPCR_val) %>%
  quantile()
##   0%  25%  50%  75% 100% 
## 0.80 0.90 0.95 1.00 1.00

RNase R precision

perc_val %>%
  filter(count_group == 'count ≥ 5') %>%
  pull(perc_RR_val) %>%
  quantile()
##        0%       25%       50%       75%      100% 
## 0.7402597 0.9491237 0.9625000 0.9808504 1.0000000
perc_val %>%
  filter(count_group == 'count < 5') %>%
  pull(perc_RR_val) %>%
  quantile()
##        0%       25%       50%       75%      100% 
## 0.5000000 0.8000000 0.8666667 0.9411765 1.0000000

amplicon sequencing precision

perc_val %>%
  filter(count_group == 'count ≥ 5') %>%
  pull(perc_amp_val) %>%
  quantile()
##        0%       25%       50%       75%      100% 
## 0.3000000 0.9329323 0.9552239 0.9843137 1.0000000
perc_val %>%
  filter(count_group == 'count < 5') %>%
  pull(perc_amp_val) %>%
  quantile()
##        0%       25%       50%       75%      100% 
## 0.1764706 0.6111111 0.7333333 0.8666667 0.9411765

compound precision

perc_val %>%
  filter(count_group == 'count ≥ 5') %>%
  pull(perc_compound_val) %>%
  quantile()
##        0%       25%       50%       75%      100% 
## 0.2714286 0.9047379 0.9310345 0.9527530 0.9830508
perc_val %>%
  filter(count_group == 'count < 5') %>%
  pull(perc_compound_val) %>%
  quantile()
##         0%        25%        50%        75%       100% 
## 0.05882353 0.53846154 0.63636364 0.76470588 0.88235294

sensitivity

sens %>%
  filter(!count_group_median == 'count ≥ 5') %>%
  filter(!tool == 'circRNA_finder',
         !tool == 'segemehl') %>%
  pull(sensitivity) %>%
  quantile()
##        0%       25%       50%       75%      100% 
## 0.1868132 0.3736264 0.6565934 0.7362637 0.8736264
sens %>%
  filter(!count_group_median == 'count < 5') %>%
  pull(sensitivity) %>%
  quantile()
##        0%       25%       50%       75%      100% 
## 0.2967742 0.4819355 0.7509677 0.8609677 0.8709677

Overall validation rates (ignoring tools) - all circ

RT-qPCR

cq %>%
  select(circ_id, cell_line, qPCR_val) %>% unique() %>%
  count(qPCR_val)
round(1479/(1479+37), digits = 3)
## [1] 0.976

RNase R

cq %>%
  select(circ_id, cell_line, RR_val) %>% unique() %>%
  count(RR_val)
round(1319/(1319+85), digits = 3)
## [1] 0.939
round(112/(1319+85+112), digits = 3)
## [1] 0.074

amplicon sequencing

cq %>%
  select(circ_id, cell_line, amp_seq_val) %>% unique() %>%
  count(amp_seq_val)
cq %>%
  select(circ_id, cell_line, amp_seq_val_detail) %>% unique() %>%
  count(amp_seq_val_detail)
round(1014/(1014+165), digits = 3)
## [1] 0.86
round(337/(1014+165+337), digits = 3)
## [1] 0.222

Overall validation rates (ignoring tools) - low-abundant circ

RT-qPCR

cq %>%
  filter(count_group == 'count < 5') %>%
  select(circ_id, cell_line, qPCR_val) %>% unique() %>%
  count(qPCR_val)
round(243/(243+17), digits = 3)
## [1] 0.935

RNase R

cq %>%
  filter(count_group == 'count < 5') %>%
  select(circ_id, cell_line, RR_val) %>% unique() %>%
  count(RR_val)
round(165/(165+25), digits = 3)
## [1] 0.868

amplicon sequencing

cq %>%
  filter(count_group == 'count < 5') %>%
  select(circ_id, cell_line, amp_seq_val) %>% unique() %>%
  count(amp_seq_val)
round((63+136)/(63+136+61), digits = 3)
## [1] 0.765
round(136/(63+136), digits = 3)
## [1] 0.683

Calculate perc of circRNAs upsetplot

nr of unique circ that pass all val methods

cq %>%
  select(circ_id, qPCR_val, RR_val, amp_seq_val, cell_line) %>%
  unique() %>%
  filter(qPCR_val == 'pass', RR_val == 'pass', amp_seq_val == 'pass')
round(957/1516, digits = 3)
## [1] 0.631

nr of unique circ that fail all val methods

cq %>%
  select(circ_id, qPCR_val, RR_val, amp_seq_val, cell_line) %>%
  unique() %>%
  filter(qPCR_val == 'fail', RR_val == 'fail', amp_seq_val == 'fail')
round(18/1516, digits = 3)
## [1] 0.012

nr of unique circ that fail 1 or 2 val methods

cq %>%
  select(circ_id, qPCR_val, RR_val, amp_seq_val, cell_line) %>%
  filter(!is.na(RR_val), !is.na(amp_seq_val)) %>%
  unique() %>%
  mutate(val_nr = str_count(paste(qPCR_val, RR_val, amp_seq_val),
                              pattern = 'pass')) %>%
  #filter(val_nr == 0)
  #filter(val_nr == 3)
  filter(val_nr < 3, val_nr > 0)
round(128/1516, digits = 3)
## [1] 0.084

nr of cicrc that have at least one NA

cq %>%
  select(circ_id, qPCR_val, RR_val, amp_seq_val, cell_line) %>%
  unique() %>%
  mutate(NA_nr = str_count(paste(qPCR_val, RR_val, amp_seq_val),
                            pattern = 'NA')) %>%
  #filter(NA_nr == 0)
  #filter(NA_nr == 3)
  filter(NA_nr > 0)
round(413/1516, digits = 3)
## [1] 0.272

nr of unique circ that have no NAs

cq %>% 
  filter(!is.na(RR_val), !is.na(amp_seq_val)) %>%
  select(circ_id, qPCR_val, RR_val, amp_seq_val, cell_line)  %>%
  unique()

=> recalculate previous numbers based on no NAs all pass

round(957/1103, digits = 3)
## [1] 0.868

all fail

round(18/1103, digits = 3)
## [1] 0.016

one or two fail

round(128/1103, digits = 3)
## [1] 0.116